# ... load the shot data and normalise the column names to lower case
# ... - home_dir and data_dir must already be defined
# ... - the file is read via its path instead of hopping into data_dir and
# ...   back, so a failed read cannot leave the session in the wrong directory
# ... - a relative data_dir is still resolved against home_dir, which remains
# ...   the working directory afterwards
setwd(home_dir)
shots <- read.csv(file.path(data_dir, "data.csv"), stringsAsFactors = FALSE)
names(shots) <- tolower(names(shots))
# ... convert character columns to factors; the first column is left as read
# ... - seq_along()[-1] is empty for a one-column frame (2:length would not be)
for (i in seq_along(shots)[-1]) {
  if (is.character(shots[[i]])) {
    shots[[i]] <- factor(shots[[i]])
  }
}
In the National Basketball Association (NBA), the court is 94 by 50 feet.
The NBA adopted the three-point line at the start of the 1979–80 season. The distance of the line from the basket varies, ranging from 22 feet (6.7 m) in the corners to 23.75 feet (7.24 m) behind the top of the key.
Kobe Bean Bryant is an American retired professional basketball player and businessman. He played his entire 20-year career with the Los Angeles Lakers of the National Basketball Association (NBA). He entered the NBA directly from high school and won five NBA championships with the Lakers.
Playing career 1996–2016
Career statistics
Points 33,643 (25.0 ppg)
Rebounds 7,047 (5.2 rpg)
Assists 6,306 (4.7 apg)
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... remove outliers ... more than 5 sigma from mean value
# ... - applies to every integer / numeric column of shots
# ... - z-scores are computed once per column, so the upper and lower cuts
# ...   are both measured against the same mean and standard deviation
# ... - which() skips NA / NaN scores (e.g. from a constant column)
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
lst <- length(shots)
for (i in seq_len(lst)) {
  if (is.numeric(shots[[i]])) {
    z <- as.vector(scale(shots[[i]]))
    shots[[i]][which(abs(z) > 5)] <- NA
  }
}
# ... add a text descriptor to correspond to shot_made_flag
# ... - 1 becomes "good", 0 becomes "no_good", anything else stays NA
# ... - just to make plots easier to understand
good_lst <- shots$shot_made_flag == 1
no_good_lst <- shots$shot_made_flag == 0
shots$shot_good <- ifelse(good_lst, "good", ifelse(no_good_lst, "no_good", NA))
# ... seconds_remaining is seconds remaining in the current minute
# ... minutes_remaining is minutes remaining in the period
# ... there are 4 (regular time) periods in a match, each of 12 minutes
# ... create new variable of time remaining in the match, in seconds
# ... - the (4 - period) term goes negative for overtime periods (period > 4)
shots$time_remaining <- with(
  shots,
  seconds_remaining + minutes_remaining * 60 + (4 - period) * 12 * 60
)
# ... home / away games can be determined from 'matchup' field
# ... @ - designates away game
# ... vs. - designates home game
# ... - every row starts as "home"; rows whose matchup contains "@" are
# ...   switched to "away"
away_lst <- grep("@", shots$matchup, perl = TRUE, value = FALSE)
shots$home_away <- replace(rep("home", nrow(shots)), away_lst, "away")
# ... shot locations appear to be in feet * 10 ... just convert to feet
shots[c("x_ft", "y_ft")] <- lapply(
  shots[c("loc_x", "loc_y")],
  function(v) v / 10
)
# ... whole-foot versions; as.integer() truncates toward zero
shots[c("int_x_ft", "int_y_ft")] <- lapply(
  shots[c("x_ft", "y_ft")],
  as.integer
)
# ... add polar coordinates - from basket to shot point
# ... - rad is copied from shot_distance rather than derived from x / y
# ... - ang is in degrees; atan2(x_ft, y_ft) measures it from the +y_ft axis,
# ...   so 0 is along +y_ft and the sign follows the sign of x_ft
shots$rad <- shots[["shot_distance"]]
shots$ang <- with(shots, atan2(x_ft, y_ft) * 180 / pi)
# ... whole-number versions (truncated toward zero) used for binning
shots$int_ang <- as.integer(shots[["ang"]])
shots$int_rad <- as.integer(shots[["rad"]])
# ... calculate total points scored in this data set
# ... - a shot is worth 3 when shot_type contains "3PT", otherwise 2
# ... - multiplying by shot_made_flag zeroes the misses and leaves NA where
# ...   the flag is NA, so the total ignores those rows
three_pt_lst <- grep("3PT", shots$shot_type, perl = TRUE, value = FALSE)
shots$pts_scored <- replace(rep(2, nrow(shots)), three_pt_lst, 3) *
  shots$shot_made_flag
total_pts_scored <- sum(shots$pts_scored, na.rm = TRUE)
# ... add date as date datatype (parsed from game_date with ymd())
shots[["date"]] <- ymd(shots[["game_date"]])
# ... split season into season year (integer)
# ... - keeps only the text before the first "-" of the season label
shots[["int_season"]] <- as.integer(sub("-.*", "", shots[["season"]]))
# ... mean of shot_made_flag (i.e. fraction of shots made) grouped by
# ... game date, season year, time remaining and whole-degree angle
avg_per_game <- aggregate(shot_made_flag ~ date, data = shots, FUN = mean)
avg_per_year <- aggregate(shot_made_flag ~ int_season, data = shots,
                          FUN = mean)
avg_per_time_remain <- aggregate(shot_made_flag ~ time_remaining,
                                 data = shots, FUN = mean)
avg_per_angle <- aggregate(shot_made_flag ~ int_ang, data = shots, FUN = mean)
# ... mean of shot_made_flag for each (whole-degree angle, whole-foot radius)
# ... cell, drawn as points coloured on a reversed 4-colour rainbow gradient
avg_per_ang_rad <- aggregate(shot_made_flag ~ int_ang + int_rad,
                             data = shots, FUN = mean)
# ... NOTE(review): fill is mapped as well as color, but only a colour scale
# ... is supplied - confirm whether the fill mapping is actually wanted
p <- ggplot(
  data = avg_per_ang_rad,
  aes(
    x = int_ang,
    y = int_rad,
    fill = shot_made_flag,
    color = shot_made_flag
  )
)
p <- p + geom_point()
p <- p + scale_colour_gradientn(
  colours = rev(rainbow(4)),
  breaks = seq(0, 1, by = 0.10)
)
p
# ... mean of shot_made_flag for each whole-foot (x, y) cell, drawn as a
# ... heat map of square points on a dark theme
avg_per_x_y <- aggregate(shot_made_flag ~ int_x_ft + int_y_ft, shots, mean)
# ... the points are coloured through the 'color' aesthetic, so the Spectral
# ... palette has to be attached to the colour scale; a fill scale would be
# ... ignored and the default gradient used instead
p <- ggplot(data = avg_per_x_y,
            aes(x = int_x_ft, y = int_y_ft,
                color = shot_made_flag)) +
  scale_colour_distiller(palette = "Spectral") +
  geom_point(shape = 15, size = 2.5) +
  theme_dark()
p
# ... mean of shot_made_flag per season, split by home / away and by the
# ... playoffs column
avg_by_home_away <- aggregate(shot_made_flag ~ home_away + int_season,
                              data = shots, FUN = mean)
avg_by_playoffs <- aggregate(shot_made_flag ~ int_season + playoffs,
                             data = shots, FUN = mean)
# scale_colour_gradientn(colours = rev(rainbow(4)),
# breaks = seq(0, 1, by = 0.20)) +
# ... drop columns that carry no information for this analysis
# ... - team_id is a constant value for all rows
# ... - team_name is always Los Angeles Lakers
# ... - game_event_id, game_id: not obviously needed <- drop for now
shots[c("team_id", "team_name", "game_event_id", "game_id")] <- NULL
# ... one-line summary per integer / numeric column (first column skipped)
# ... stats columns, in order:
# ...   non-NA count | NA count | mean | median | max | min | skewness
# ... - each row is built separately and bound once at the end, instead of
# ...   growing the table with rbind() on every iteration
# ... - the empty frame leads the list so the result is still a data frame
# ...   with columns x and stats when no numeric column is found
numeric_cols <- Filter(
  function(i) is.numeric(shots[[i]]),
  seq_along(shots)[-1]
)
summary_rows <- lapply(numeric_cols, function(i) {
  col <- shots[[i]]
  n_present <- sum(!is.na(col))
  data.frame(
    x = names(shots)[i],
    stats = sprintf(
      "| %8d | %8d | %8.1f | %8.1f | %8.1f | %8.1f | %8.3f | ",
      n_present,
      length(col) - n_present,
      mean(col, na.rm = TRUE),
      median(col, na.rm = TRUE),
      max(col, na.rm = TRUE),
      min(col, na.rm = TRUE),
      skewness(col, na.rm = TRUE)
    ),
    stringsAsFactors = FALSE
  )
})
summary_tbl <- do.call(
  rbind,
  c(
    list(data.frame(x = character(0), stats = character(0),
                    stringsAsFactors = FALSE)),
    summary_rows
  )
)
summary_tbl
## x
## 1 lat
## 2 loc_x
## 3 loc_y
## 4 lon
## 5 minutes_remaining
## 6 period
## 7 playoffs
## 8 seconds_remaining
## 9 shot_distance
## 10 shot_made_flag
## 11 shot_id
## 12 time_remaining
## 13 x_ft
## 14 y_ft
## 15 int_x_ft
## 16 int_y_ft
## 17 rad
## 18 ang
## 19 int_ang
## 20 int_rad
## 21 pts_scored
## 22 int_season
## stats
## 1 | 30655 | 42 | 34.0 | 34.0 | 34.1 | 33.5 | -0.559 |
## 2 | 30697 | 0 | 7.1 | 0.0 | 248.0 | -250.0 | -0.085 |
## 3 | 30655 | 42 | 90.4 | 74.0 | 528.0 | -44.0 | 0.559 |
## 4 | 30697 | 0 | -118.3 | -118.3 | -118.0 | -118.5 | -0.085 |
## 5 | 30697 | 0 | 4.9 | 5.0 | 11.0 | 0.0 | 0.199 |
## 6 | 30697 | 0 | 2.5 | 3.0 | 7.0 | 1.0 | 0.055 |
## 7 | 30697 | 0 | 0.1 | 0.0 | 1.0 | 0.0 | 1.999 |
## 8 | 30697 | 0 | 28.4 | 28.0 | 59.0 | 0.0 | 0.031 |
## 9 | 30673 | 24 | 13.4 | 15.0 | 60.0 | 0.0 | -0.036 |
## 10 | 25697 | 5000 | 0.4 | 0.0 | 1.0 | 0.0 | 0.217 |
## 11 | 30697 | 0 | 15349.0 | 15349.0 | 30697.0 | 1.0 | 0.000 |
## 12 | 30697 | 0 | 1387.5 | 1381.0 | 2874.0 | -2146.0 | -0.074 |
## 13 | 30697 | 0 | 0.7 | 0.0 | 24.8 | -25.0 | -0.085 |
## 14 | 30655 | 42 | 9.0 | 7.4 | 52.8 | -4.4 | 0.559 |
## 15 | 30697 | 0 | 0.7 | 0.0 | 24.0 | -25.0 | -0.082 |
## 16 | 30655 | 42 | 8.7 | 7.0 | 52.0 | -4.0 | 0.596 |
## 17 | 30673 | 24 | 13.4 | 15.0 | 60.0 | 0.0 | -0.036 |
## 18 | 30655 | 42 | 3.8 | 0.0 | 180.0 | -174.3 | -0.031 |
## 19 | 30655 | 42 | 3.8 | 0.0 | 180.0 | -174.0 | -0.029 |
## 20 | 30673 | 24 | 13.4 | 15.0 | 60.0 | 0.0 | -0.036 |
## 21 | 25697 | 5000 | 1.0 | 0.0 | 3.0 | 0.0 | 0.390 |
## 22 | 30697 | 0 | 2005.4 | 2005.0 | 2015.0 | 1996.0 | 0.047 |
# ... court scatter plots: the formula x_ft ~ y_ft puts y_ft on the
# ... horizontal axis and x_ft on the vertical axis
# ... - col is evaluated inside shots, so points are coloured by that column
plot(x_ft ~ y_ft, data = shots, col = "blue")
plot(x_ft ~ y_ft, data = shots, col = shot_distance)
plot(x_ft ~ y_ft, data = shots, col = shot_zone_area)
plot(x_ft ~ y_ft, data = shots, col = shot_zone_range)
# ... flag 0 -> colour 2, flag 1 -> colour 3
plot(x_ft ~ y_ft, data = shots, col = shot_made_flag + 2)
# ... made shots only, then missed shots only; xlim limits the y_ft axis
plot(
  x_ft ~ y_ft,
  data = shots[shots$shot_made_flag == 1, ],
  col = "darkgreen",
  xlim = c(-10, 60)
)
plot(
  x_ft ~ y_ft,
  data = shots[shots$shot_made_flag == 0, ],
  col = "grey",
  xlim = c(-10, 60)
)
plot(x_ft ~ y_ft, data = shots, col = shot_type)
# ... same made / missed colouring in polar coordinates
plot(rad ~ ang, data = shots, col = shot_made_flag + 2)
# ... shot distance split by shot type and by outcome label
boxplot(shot_distance ~ shot_type, data = shots)
boxplot(shot_distance ~ shot_good, data = shots)
# ... spine plots of the outcome label (shot_good) against each predictor
# ... - one plot per predictor, in the order listed, all with the same two
# ...   colours
for (predictor in c("shot_type", "shot_distance", "shot_zone_area",
                    "shot_zone_range", "ang", "rad")) {
  spine_formula <- as.formula(paste("factor(shot_good) ~", predictor))
  spineplot(spine_formula,
            data = shots,
            col = c("#41ae76", "#8c96c6"))
}